View Javadoc

1   
2   /*
3    * SmartCrawler
4    *
5    * $Id: DownloadEngine.java,v 1.8 2005/08/05 14:06:33 vincool Exp $
6    * Copyright 2005 Davide Pozza
7    *
8    * This program is free software; you can redistribute it
9    * and/or modify it under the terms of the GNU General Public
10   * License as published by the Free Software Foundation;
11   * either version 2 of the License, or (at your option) any
12   * later version.
13   *
14   * This program is distributed in the hope that it will be
15   * useful, but WITHOUT ANY WARRANTY; without even the implied
16   * warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
17   * PURPOSE. See the GNU General Public License for more
18   * details.
19   *
20   * You should have received a copy of the GNU General Public
21   * License along with this program; if not, write to the Free
22   * Software Foundation, Inc., 59 Temple Place, Suite 330,
23   * Boston, MA 02111-1307 USA
24   *
25   */
26  
27  package org.smartcrawler;
28  
29  import java.util.Collection;
30  import java.util.Date;
31  import org.apache.log4j.Logger;
32  import org.smartcrawler.common.Context;
33  import org.smartcrawler.extractor.LinksExtractor;
34  import org.smartcrawler.extractor.RegExpLinksExtractor;
35  import org.smartcrawler.filter.FilterManager;
36  import org.smartcrawler.persistence.Persister;
37  import org.smartcrawler.common.Link;
38  import org.smartcrawler.common.Provider;
39  import org.smartcrawler.common.ProviderFactory;
40  import org.smartcrawler.common.SCLogger;
41  import org.smartcrawler.retriever.Content;
42  import org.smartcrawler.retriever.HttpCall;
43  import org.smartcrawler.retriever.Response;
44  import org.smartcrawler.retriever.Retriever;
45  
46  /***
47   * The engine thread which is started by the {@link org.smartcrawler.Crawler}
48   *
49   * @author <a href="mailto:pozzad@alice.it">Davide Pozza</a>
50   * @version <tt>$Revision: 1.8 $</tt>
51   */
52  public class DownloadEngine extends Thread {
53  
54      private Provider linksProv;
55      private Context conf;
56      private static Logger log = SCLogger.getLogger(DownloadEngine.class);
57      private static Logger logCons = SCLogger.getConsoleLogger();
58      private Retriever retriever;
59      private FilterManager fMan;
60      private Persister persister;
61      /***
62       * Creates a new instance of Engine
63       *
64       * @param conf The the {@link org.smartcrawler.common.SiteConfiguration}
65       * @param retriever The supplied {@link org.smartcrawler.retriever.Retriever}
66       */
67      public DownloadEngine(Context conf) {
68          this.conf = conf;
69          this.retriever = this.conf.getRetriever();
70          this.persister = this.conf.getPersister();
71          this.linksProv = ProviderFactory.instance().create();
72          Collection cPrec = conf.getPrecFiltersList();
73          Collection cPost = conf.getPostFiltersList();
74          log.debug("DownloadEngine() cPrec.size()="+cPrec.size()+
75                  " cPost.size()="+cPost.size());
76          this.fMan = new FilterManager();
77          this.fMan.addPrecFilters(cPrec);
78          this.fMan.addPostFilters(cPost);
79      }
80  
81      /***
82       *
83       *  The main method of the thread. While the {@link org.smartcrawler.common.LinksProvider}
84       *  contains a link to process, the engine retrieves and fetches it.
85       */
86      public void run() {
87          log.debug("run(): BEGIN");
88          long startTime = (new Date()).getTime();
89  
90          log.info("Started.");
91          logCons.info("Started.");
92          while (!linksProv.isEmpty()) {
93              if (linksProv.size() > 0) {
94                  //1) get next link obj from queue
95                  Link link = linksProv.next();
96                  if (link == null){
97                      continue;
98                  }
99                  //2) download the content
100                 log.debug(" Processing " + link);
101                 HttpCall call = new HttpCall(link);
102                 Response result = retriever.execute(call);
103                 if (result.isRedirected()) {
104                     log.info("The link " +
105                             link + " redirects to " + result.getRedirection());
106                     logCons.info("The link " +
107                             link + " redirects to " + result.getRedirection());
108                     if (this.fMan.isPermitted(this.conf,
109                             result.getRedirection())) {
110                         linksProv.store(result.getRedirection());
111                     }
112                 }
113 
114                 if (result.isFound()) {
115                     Content content = result.getContent();
116                     if (this.fMan.isPermitted(this.conf, content)) {
117                         //3) Persist it
118                         persister.persist(content);
119                     }
120 
121                     //4) extract the links
122                     LinksExtractor extractor = new RegExpLinksExtractor(link);
123                     Link[] links = extractor.extract(content);
124 
125                     //3) put the links into the queue
126                     for (Link newLink : links) {
127 
128                         //4) check by the permission handler
129                         if (this.fMan.isPermitted(this.conf, newLink)) {
130                             linksProv.store(newLink);
131                         }
132                     }
133                 }
134                 linksProv.confirm(link);
135             }
136 
137         }
138         long endTime = (new Date()).getTime();
139         long totTimeMins = (endTime - startTime)/1000;
140 
141         log.debug("run(): END");
142         logCons.info("Shutted down [elapsed time: "
143                 + totTimeMins + "sec.].");
144     }
145 }